Functions and imports

fileprefix = "20210717_"

ref_file_mm10 = "~/Documents/reference_data/mouse/ensembl/ensembl_gene_length_mm10.txt.gz"

# This is required to set the fonts of the paper plots to Arial
library(extrafont)
font_import(prompt=FALSE)
loadfonts()

library(RColorBrewer)
library(Seurat)
library(scater)
library(cowplot)
library(dplyr)
library(tidyr)
library(SoupX)

calc_tpm = function(dat, ref_file) {
  mm_annot = read.table(ref_file, header=T, stringsAsFactors=F, sep="\t")
  eff_length = mm_annot$eff_length[match(rownames(dat), mm_annot$mgi_symbol)]
  eff_length[!is.finite(eff_length)] = 1
  
  sce = SingleCellExperiment(assays=list(counts=as.matrix(dat), logcounts=log2(as.matrix(dat)+1)))
  tpm = calculateTPM(sce, eff_length)
  return(tpm)
}

theme_publication_plot = function(p, legend_title, legend_aes=4) {
  p = p +
  theme_cowplot() + 
  theme(axis.text = element_text(family="Arial", colour="black",size=10,face="plain"),
                       axis.title = element_text(family="Arial", colour="black",size=12,face="plain"),
                       strip.text.x = element_text(family="Arial", colour="black",size=12,face="plain", angle=90),
                       strip.text.y = element_text(family="Arial", colour="black",size=12,face="plain", angle=360),
                       strip.background = element_blank(),
                       panel.spacing.x = unit(1, "mm"),
                       panel.spacing.y = unit(3, "mm"),
                       plot.title = element_text(family="Arial", colour="black",size=12,face="plain",hjust = 0.5))
  if (!is.null(legend_title)) {
     p = p + guides(fill = guide_legend(title=legend_title,
                              override.aes = list(size=legend_aes)),
                            colour = guide_legend(title=legend_title,
                              override.aes = list(size=legend_aes)))
  }
  return(p)
}

Load the data

samplenames = c("EA_WT_1", "EA_WT_2", "EA_NOTCH1_HOM_1", "EA_NOTCH1_HOM_2")
library_names = c("WT 1", "WT 2", "HOM KO 1", "HOM KO 2")
prefixes = c("c1_", "c2_", "s1_", "s2_")
expr_matrices = list()
# celltypes_all = list()
for (i in 1:length(samplenames)) {
  prefix = prefixes[i]
  in_path = file.path("../cellranger/", samplenames[i])
  sc = load10X(in_path)
  
  out = sc$toc
  colnames(out) = paste0(prefix, colnames(out))

  expr_matrices[[length(expr_matrices)+1]] = out
  
  # This loads a previous set of cell type assignments that was used to compare the new setup
  # celltypes = read.table(file.path("../identify_celltypes", samplenames[i], "celltypes_per_cell.txt"), header=T, stringsAsFactors=F)
  # celltypes$cell = paste0(prefix, celltypes$cell)
  # celltypes_all[[length(celltypes_all)+1]] = celltypes
}

Read in some CellRanger stats

stats = list()
for (i in 1:length(samplenames)) {
  stats_sample = read.table(file.path("../cellranger/", samplenames[i], "metrics_summary.csv"), sep=",", header=T, stringsAsFactors=F)
  stats_sample$library = samplenames[i]
  stats[[samplenames[i]]] = stats_sample
}
stats = do.call(rbind, stats)
print(stats)
Registered S3 method overwritten by 'cli':
  method     from         
  print.boxx spatstat.geom

Explore to identify filters

Here we plot a summary of general QC metrics to identify unhappy cells to be filtered

nFeature_RNA_min = 2500
nFeature_RNA_max = 6500
nCount_RNA_max = 55000
prop.mt_min = 0.03
prop.mt_max = 0.1

seu = list()
for (i in 1:length(expr_matrices)) {
  seu[[i]] = CreateSeuratObject(counts = expr_matrices[[i]], min.cells = 0, min.features = 0)
  
  mito.genes = rownames(GetAssayData(object=seu[[i]]))[grepl(pattern = "^MT-", x = toupper(rownames(GetAssayData(object=seu[[i]]))))]
  percent.mito = Matrix::colSums(GetAssayData(object=seu[[i]], slot="counts")[mito.genes, ]) / Matrix::colSums(GetAssayData(object=seu[[i]], slot="counts"))
  seu[[i]] = AddMetaData(object = seu[[i]], metadata = percent.mito, col.name = "prop.mt")
  
  # plot prop.mt vs nFeature_RNA and vs nCount_RNA
  p1 = ggplot(seu[[i]]@meta.data) + aes(x=prop.mt, y=nFeature_RNA) + geom_point(size=0.1, colour="red") + scale_y_log10() + theme_cowplot() + geom_vline(xintercept=c(prop.mt_min, prop.mt_max), linetype=2) + geom_hline(yintercept=c(nFeature_RNA_min, nFeature_RNA_max), linetype=2)
  
  p2 = ggplot(seu[[i]]@meta.data) + aes(x=prop.mt, y=nCount_RNA) + geom_point(size=0.1, colour="blue") + scale_y_log10() + theme_cowplot() + geom_vline(xintercept=c(prop.mt_min, prop.mt_max), linetype=2) + geom_hline(yintercept=c(nCount_RNA_max), linetype=2)
  
  print(plot_grid(p1, p2))
  
  p1 = VlnPlot(seu[[i]], group.by="orig.ident", features = c( "nFeature_RNA"), ncol = 1, pt.size=0) +
    geom_hline(yintercept = c(nFeature_RNA_min, nFeature_RNA_max), linetype=2) + theme(legend.position="none")
  p2 = VlnPlot(seu[[i]], group.by="orig.ident", features = c( "nCount_RNA"), ncol = 1, pt.size=0) +
    geom_hline(yintercept = c(nCount_RNA_max), linetype=2) + theme(legend.position="none")
  p3 = VlnPlot(seu[[i]], group.by="orig.ident", features = c( "prop.mt"), ncol = 1, pt.size=0) + theme(legend.position="none") + geom_hline(yintercept = c(prop.mt_max, prop.mt_min), linetype=2)
  
  # if (grepl("KO", samplenames[i], fixed=T)) {
  #   p3 = p3 + geom_hline(yintercept = c(0.1, 0.002), linetype=2)
  # } else {
  #   p3 = p3 + geom_hline(yintercept = c(0.1, 0.01), linetype=2)
  # }
  
  print(plot_grid(p1, p2, p3, ncol=3))
}

All cells

Apply batch correction - MT filters and adjust for MT, nGenes and cell cycle

seu = list()
for (i in 1:length(expr_matrices)) {
  seu[[i]] = CreateSeuratObject(counts = expr_matrices[[i]], min.cells = 30, min.features = 2500)
  
  mito.genes = rownames(GetAssayData(object=seu[[i]]))[grepl(pattern = "^MT-", x = toupper(rownames(GetAssayData(object=seu[[i]]))))]
  percent.mito = Matrix::colSums(GetAssayData(object=seu[[i]], slot="counts")[mito.genes, ]) / Matrix::colSums(GetAssayData(object=seu[[i]], slot="counts"))
  seu[[i]] = AddMetaData(object = seu[[i]], metadata = percent.mito, col.name = "prop.mt")
  
  # Restrict proportion MT expression to remove unhappy cells
  seu[[i]] = subset(x = seu[[i]], subset = prop.mt > 0.03 & prop.mt < 0.1)
  
  # Remove potential doublets
  seu[[i]] = subset(x = seu[[i]], subset = nFeature_RNA < 6500 & nCount_RNA < 55000)
  
  seu[[i]] = SCTransform(seu[[i]], verbose = FALSE, vars.to.regress=c("prop.mt", "nFeature_RNA", "nCount_RNA"))
  seu[[i]] = CellCycleScoring(seu[[i]], s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes, assay = 'SCT', set.ident = TRUE)
  # Enable this when also adjusting for cell cycle
  seu[[i]] = SCTransform(seu[[i]], verbose = FALSE, vars.to.regress=c("prop.mt", "nFeature_RNA", "nCount_RNA", "S.Score", "G2M.Score"))
  
  seu[[i]] = AddMetaData(object=seu[[i]], metadata=factor(rep(library_names[i], length(seu[[i]]$nCount_RNA)), levels=library_names), col.name="library")
}
# This is required for PrepSCTIntegration, an increase from the R default 512 to 5120 for this dataset
options(future.globals.maxSize=5120*1024^2)

seu_features = SelectIntegrationFeatures(object.list = seu, nfeatures = 3000)
seu = PrepSCTIntegration(object.list = seu, anchor.features = seu_features, verbose = FALSE)
seu = lapply(X = seu, FUN = RunPCA, verbose = FALSE, features = seu_features)
seu_anchors = FindIntegrationAnchors(object.list = seu, normalization.method = "SCT", anchor.features = seu_features, verbose = FALSE, reduction = "rpca")

  |                                                  | 0 % ~calculating  
  |+++++++++                                         | 17% ~40s          
  |+++++++++++++++++                                 | 33% ~27s          
  |+++++++++++++++++++++++++                         | 50% ~19s          
  |++++++++++++++++++++++++++++++++++                | 67% ~15s          
  |++++++++++++++++++++++++++++++++++++++++++        | 83% ~07s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=45s  
seu_integrated = IntegrateData(anchorset = seu_anchors, normalization.method = "SCT", verbose = FALSE)
seu_integrated@meta.data$library = factor(seu_integrated@meta.data$library, levels=library_names)
seu_integrated = RunPCA(seu_integrated, verbose = FALSE)

VlnPlot(seu_integrated, group.by="orig.ident", features = c("nFeature_RNA", "nCount_RNA", "percent.mito"), ncol = 3, pt.size=0.01)
Warning in FetchData(object = object, vars = features, slot = slot) :
  The following requested variables were not found: percent.mito

ElbowPlot(seu_integrated, ndims=50)

At 30 dimensions the tail is very flat already

seu_integrated = RunUMAP(seu_integrated, dims = 1:30)
Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session
23:39:04 UMAP embedding parameters a = 0.9922 b = 1.112
23:39:04 Read 13111 rows and found 30 numeric columns
23:39:04 Using Annoy for neighbor search, n_neighbors = 30
23:39:04 Building Annoy index with metric = cosine, n_trees = 50
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
23:39:06 Writing NN index file to temp file /var/folders/bz/p63rv1754pv9v6rz33xgmy8sgyrrg3/T//Rtmpjo3Fsa/file57144845a637
23:39:06 Searching Annoy index using 1 thread, search_k = 3000
23:39:11 Annoy recall = 100%
23:39:16 Commencing smooth kNN distance calibration using 1 thread
23:39:19 Initializing from normalized Laplacian + noise
23:39:20 Commencing optimization for 200 epochs, with 566230 positive edges
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
23:39:30 Optimization finished
UMAPPlot(seu_integrated)

p = UMAPPlot(seu_integrated,group.by="library")
p = theme_publication_plot(p, "Library")
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_umap_overlay_library.pdf"),
                   base_height=4,
                   base_width=6)


FeaturePlot(seu_integrated, features=c("Krt14", "Tgm3", "Krt4", "Lor"))


VlnPlot(seu_integrated, group.by="orig.ident", features = "prop.mt", ncol = 1, pt.size=0.01)


FeaturePlot(seu_integrated, features=c("nFeature_RNA", "nCount_RNA", "prop.mt"))

Assign a celltype to all cells

Here we consider a number of markers with a threshold each to establish celltypes

celltypes = do.call(rbind, celltypes_all)
tpm = calc_tpm(seu_integrated@assays[["RNA"]]@counts, ref_file_mm10)

# get tpm for marker genes per cluster
seu_integrated = FindNeighbors(seu_integrated, dims = 1:30)
Computing nearest neighbor graph
Computing SNN
seu_integrated = FindClusters(object = seu_integrated, resolution = 0.6, verbose=FALSE)
markers_all_cells = FindAllMarkers(seu_integrated, only.pos = TRUE, min.pct = 0.1, thresh.use = 0.25, verbose = F)
For a more efficient implementation of the Wilcoxon Rank Sum Test,
(default method for FindMarkers) please install the limma package
--------------------------------------------
install.packages('BiocManager')
BiocManager::install('limma')
--------------------------------------------
After installation of limma, Seurat will automatically use the more 
efficient implementation (no further action necessary).
This message will be shown once per session
mdata = seu_integrated@meta.data
mdata$celltype = NA
inboth = intersect(gsub("-1", "", rownames(mdata)), celltypes$cell)
mdata$celltype[match(gsub("-1", "", rownames(mdata)), inboth)] = celltypes$classification[match(inboth, celltypes$cell)]
seu_integrated = AddMetaData(seu_integrated, mdata$celltype, "celltype")

UMAPPlot(seu_integrated, group.by="celltype")


cluster_classification = data.frame(clusterid=sort(unique(mdata$seurat_clusters)))

make_plot = function(mdata, tpm, gene_name) {
  plot_dat = data.frame(cell=rownames(mdata), cluster=mdata$seurat_clusters, tpm=tpm[gene_name,rownames(mdata)])
  plot_dat = plot_dat[is.finite(plot_dat$tpm),]
  plot_dat_clusters  = plot_dat %>% group_by(cluster) %>% summarise(n=n(), mean=mean(tpm), median=median(tpm))
  p2 = ggplot(plot_dat_clusters) + aes(x=cluster, y=median, label=cluster) + geom_text() + theme_cowplot() + ggtitle(gene_name)
  return(list(p=p2, plot_dat_clusters=plot_dat_clusters))
}

make_umap_plot = function(plot_dat, gene_name) {
  plot_dat[, gene_name] = log(plot_dat[, gene_name])
  gene_name_title = paste0(toupper(substr(gene_name, 1, 1)), substr(gene_name, 2, nchar(gene_name)))
  return(ggplot(plot_dat) + aes_string(x="UMAP_1", y="UMAP_2", colour=gene_name) + 
           geom_point(size=0.25) + scale_colour_gradient(low="grey", high="red") +
           theme_cowplot() + ggtitle(gene_name_title))
}

plot_data = as.data.frame(seu_integrated@reductions$umap@cell.embeddings)

# Keratinocytes
p1 = make_plot(mdata, tpm, "Krt14")
cluster_classification$krt14 = p1$plot_dat_clusters$median > 50
plot_data$krt14 = tpm["Krt14", rownames(plot_data)]

p2 = make_plot(mdata, tpm, "Tgm3")
cluster_classification$tgm3 = p2$plot_dat_clusters$median > 5
plot_data$tgm3 = tpm["Tgm3", rownames(plot_data)]

p3 = make_plot(mdata, tpm, "Lor")
cluster_classification$lor = p3$plot_dat_clusters$median > 50
plot_data$lor = tpm["Lor", rownames(plot_data)]

p4 = make_plot(mdata, tpm, "Krt4")
# cluster_classification$tgm3 = plot_dat_clusters$median > 10
plot_data$krt4 = tpm["Krt4", rownames(plot_data)]

plot_grid(plotlist=list(p1$p, p2$p, p3$p, p4$p), ncol=2)

p = plot_grid(plotlist=list(theme_publication_plot(make_umap_plot(plot_data, "krt14"), "log(TPM)"),
                        theme_publication_plot(make_umap_plot(plot_data, "tgm3"), "log(TPM)"),
                        theme_publication_plot(make_umap_plot(plot_data, "lor"), "log(TPM)"),
                        theme_publication_plot(make_umap_plot(plot_data, "krt4"), "log(TPM)")), ncol=2, nrow=2)
print(p)

plot_grid(plotlist=list(make_umap_plot(plot_data, "krt14"),
                        make_umap_plot(plot_data, "krt4")), ncol=2, nrow=2)

plot_grid(plotlist=list(make_umap_plot(plot_data, "krt14"),
                        make_umap_plot(plot_data, "tgm3")), ncol=2, nrow=2)

cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_markers_keratinocyte.pdf"),
                   base_height=4,
                   base_width=6)



# Fibroblasts and endothelial cells
p1 = make_plot(mdata, tpm, "Col1a1")
cluster_classification$col1a1 = p1$plot_dat_clusters$median > 10
plot_data$col1a1 = tpm["Col1a1", rownames(plot_data)]

p2 = make_plot(mdata, tpm, "Pecam1")
cluster_classification$pecam1 = p2$plot_dat_clusters$median > 2
plot_data$pcam1 = tpm["Pecam1", rownames(plot_data)]

plot_grid(plotlist=list(p1$p, p2$p), ncol=2)

p = plot_grid(plotlist=list(theme_publication_plot(make_umap_plot(plot_data, "col1a1"), "log(TPM)"),
                        theme_publication_plot(make_umap_plot(plot_data, "pcam1"), "log(TPM)")), ncol=2, nrow=2)
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_markers_fibroblast_endothelial.pdf"),
                   base_height=4,
                   base_width=6)



# Immune cells

# B-cell marker
p1 = make_plot(mdata, tpm, "Cd83")
cluster_classification$cd83 = p1$plot_dat_clusters$median > 1
plot_data$cd83 = tpm["Cd83", rownames(plot_data)]

p2 = make_plot(mdata, tpm, "Cd84")
cluster_classification$cd84 = p2$plot_dat_clusters$median > 1
plot_data$cd84 = tpm["Cd84", rownames(plot_data)]

p3 = make_plot(mdata, tpm, "Cd86")
cluster_classification$cd86 = p3$plot_dat_clusters$median > 0.4
plot_data$cd86 = tpm["Cd86", rownames(plot_data)]

plot_grid(plotlist=list(p1$p, p2$p, p3$p), ncol=2)

plot_grid(plotlist=list(make_umap_plot(plot_data, "cd83"),
                        make_umap_plot(plot_data, "cd84"),
                        make_umap_plot(plot_data, "cd86")), ncol=2, nrow=2)


# T-cells

# t-cell marker
p1 = make_plot(mdata, tpm, "Trbc2")
cluster_classification$trbc2 = p1$plot_dat_clusters$median > 50
plot_data$trbc2 = tpm["Trbc2", rownames(plot_data)]

# not specific
p2 = make_plot(mdata, tpm, "Cd52")
cluster_classification$cd52 = p2$plot_dat_clusters$median > 50
plot_data$cd52 = tpm["Cd52", rownames(plot_data)]

# t-cell marker
p3 = make_plot(mdata, tpm, "Ptprc")
cluster_classification$ptprc = p3$plot_dat_clusters$median > 50
plot_data$ptprc = tpm["Ptprc", rownames(plot_data)]

plot_grid(plotlist=list(p1$p, p2$p, p3$p), ncol=2)

p = plot_grid(plotlist=list(theme_publication_plot(make_umap_plot(plot_data, "ptprc"), "log(TPM)"),
                        theme_publication_plot(make_umap_plot(plot_data, "cd52"), "log(TPM)")), ncol=2, nrow=2)
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_markers_immune.pdf"),
                   base_height=4,
                   base_width=6)

plot <- DimPlot(object = seu_integrated, reduction = "umap")
LabelClusters(plot = plot, id = 'ident', size=6)

Now lets assign a celltype using the marker thresholds established above

cluster_classification$celltype = NA
cluster_classification$celltype[cluster_classification$krt14 | cluster_classification$tgm3 | cluster_classification$lor] = "keratinocyte"
cluster_classification$celltype[cluster_classification$col1a1] = "fibroblast"
cluster_classification$celltype[cluster_classification$pecam1] = "endothelial"
cluster_classification$celltype[cluster_classification$cd83 | cluster_classification$cd84 | cluster_classification$cd86 | cluster_classification$cd52 | cluster_classification$trbc2] = "immune"
cluster_classification_table = cluster_classification$celltype
names(cluster_classification_table) = as.character(cluster_classification$clusterid)
seu_integrated = AddMetaData(seu_integrated, cluster_classification_table[as.character(seu_integrated@meta.data$seurat_clusters)], "celltype")
UMAPPlot(seu_integrated, group.by="celltype")


plot_dat = as.data.frame(seu_integrated@reductions[["umap"]]@cell.embeddings)
plot_dat$celltype = seu_integrated@meta.data$celltype
plot_dat$celltype = unlist(lapply(plot_dat$celltype, function(x) { paste(toupper(substr(x, 1, 1)), substr(x, 2, nchar(x)), sep="") }))
plot_dat$celltype_fctr = factor(plot_dat$celltype, levels=c("Keratinocyte", "Fibroblast", "Immune", "Endothelial"))

A number of cells appear to be assigned to the wrong cluster, based on the UMAP space, set these to NA, just to be sure.

plot_data = as.data.frame(seu_integrated@reductions$umap@cell.embeddings)
plot_data$celltype = seu_integrated@meta.data$celltype
plot_data = plot_data[plot_data$celltype=="keratinocyte",]
plot_data$selection = plot_data$UMAP_1 < -5 | plot_data$UMAP_2 > 5
plot_data$selection_fctr = factor(plot_data$selection, levels=c(TRUE, FALSE))

mycolours = c("red", "grey")
names(mycolours) = c(TRUE, FALSE)

p = ggplot(plot_data) + aes_string(x="UMAP_1", y="UMAP_2", colour="selection_fctr") + 
           geom_point(size=0.25) + scale_colour_manual(values=mycolours) +
           theme_cowplot()
print(p)

seu_integrated@meta.data$celltype[rownames(seu_integrated@meta.data) %in% rownames(plot_data)[plot_data$selection]] = NA
UMAPPlot(seu_integrated, group.by="Phase")


plot_data = as.data.frame(seu_integrated@reductions$umap@cell.embeddings)
plot_data$celltype = seu_integrated@meta.data$celltype
plot_data$celltype = unlist(lapply(plot_data$celltype, function(x) { paste(toupper(substr(x, 1, 1)), substr(x, 2, nchar(x)), sep="") }))
plot_data$celltype_fctr = factor(plot_data$celltype, levels=c("Keratinocyte", "Fibroblast", "Immune", "Endothelial"))
mycolours = brewer.pal(4, "Dark2")
p = ggplot(plot_data) + 
  aes(x=UMAP_1, y=UMAP_2, colour=celltype_fctr) + 
  geom_point(size=0.2) + 
  scale_colour_manual(values=mycolours, na.value="grey") +
  xlab("UMAP 1") + ylab("UMAP 2")
p = theme_publication_plot(p, "Cell type")
print(p)

cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "figure5d.pdf"),
                   base_height=4,
                   base_width=6)

Plot Notch1 expression per library

notch1_expr = data.frame(expression=tpm["Notch1", rownames(seu_integrated@meta.data)], library=seu_integrated@meta.data$library)
notch1_expr$library = factor(notch1_expr$library, levels=c("WT 1", "WT 2", "HOM KO 1", "HOM KO 2"))
p = ggplot(notch1_expr) + aes(x=library, y=expression, fill=library) + geom_boxplot() + 
  theme_cowplot() + xlab("Library") + ylab("TPM")
p = theme_publication_plot(p, "Library", legend_aes = 1)
print(p)

cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_notch1_expression_all_cells.pdf"),
                   base_height=4,
                   base_width=6)

Save

save(file=paste0(fileprefix, "notch1_batch_effect_with_filter_with_adjustment.RData"), seu_integrated, markers_all_cells)
metdata = seu_integrated@meta.data
metdata$cell = rownames(metdata)
write.table(metdata, file=paste0(fileprefix, "notch1_celltypes_and_metadata.txt"), quote=F, sep="\t", row.names=F)

Cell type counts per library

celltype_count = seu_integrated@meta.data[, c("orig.ident", "celltype")] %>% group_by(orig.ident, celltype) %>% summarise(n=n()) %>% spread(celltype, n)
`summarise()` has grouped output by 'orig.ident'. You can override using the `.groups` argument.
celltype_frac = celltype_count[, 2:ncol(celltype_count)] / rowSums(celltype_count[, 2:ncol(celltype_count)], na.rm=T)
celltype_frac = cbind(data.frame(library=c("WT 1", "WT 2", "HOM KO 1", "HOM KO 2")), celltype_frac)
celltype_frac
write.table(celltype_frac, file=paste0(fileprefix, "notch1_cell_type_counts_per_library.txt"), quote=F, sep="\t", row.names=F)


celltype_frac$library = factor(celltype_frac$library, levels=c("WT 1", "WT 2", "HOM KO 1", "HOM KO 2"))
cell_type_frac_wider = celltype_frac %>% pivot_longer(!library)
cell_type_frac_wider$name = unlist(lapply(cell_type_frac_wider$name, function(x) { paste(toupper(substr(x, 1, 1)), substr(x, 2, nchar(x)), sep="") }))
cell_type_frac_wider$name = factor(cell_type_frac_wider$name, levels=levels(plot_dat$celltype_fctr))
p = ggplot(cell_type_frac_wider) + 
  aes(x=library, y=value, fill=name) + 
  geom_bar(position="dodge", stat="identity") + 
  scale_fill_manual(values=mycolours, na.value="grey") +
  xlab("Library") + ylab("Proportion of cells per library")
p = theme_publication_plot(p, "Cell type")
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "figure5e_alt1.pdf"),
                   base_height=4,
                   base_width=6)


p = ggplot(cell_type_frac_wider) + 
  aes(x=library, y=value, fill=name) + 
  geom_bar(stat="identity", position = position_fill(reverse = TRUE)) + 
  scale_fill_manual(values=mycolours, na.value="grey") +
  xlab("Library") + ylab("Proportion of cells per library")
p = theme_publication_plot(p, "Cell type")
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "figure5e_alt2.pdf"),
                   base_height=4,
                   base_width=6)

Analysis keratinocytes

Rerun integration on keratinocytes only. Here we do not perform cell cycle adjustment, as the cell cycle is deeply connected to the tissue dynamics we’re interested in

keratinocyte_barcodes = rownames(seu_integrated@meta.data)[seu_integrated@meta.data$celltype=="keratinocyte" & !is.na(seu_integrated@meta.data$celltype)]

# Initial run found these cells as an outlier cluster and expressing fibroblast markers. Here these are also removed
misidentified_fibroblasts = read.table("20201211_fibroblast_identified_as_keratinocyte.txt", header=T, stringsAsFactors=F)
keratinocyte_barcodes = keratinocyte_barcodes[keratinocyte_barcodes %in% misidentified_fibroblasts$cell[!misidentified_fibroblasts$fibroblast_identified_as_keratinocyte]]

outlier_cells = read.table("20201211_notch1_cluster_assignments_to_remove_outlier_cells.txt", header=T, stringsAsFactors=F)
outlier_cells = outlier_cells[outlier_cells$is_outlier,]
keratinocyte_barcodes = keratinocyte_barcodes[!keratinocyte_barcodes %in% outlier_cells$cell]

seu_kera = list()
for (i in 1:length(expr_matrices)) {
  seu_kera[[i]] = CreateSeuratObject(counts = expr_matrices[[i]][, colnames(expr_matrices[[i]]) %in% keratinocyte_barcodes], min.cells = 30, min.features = 3000)
  
  mito.genes <- rownames(GetAssayData(object=seu_kera[[i]]))[grepl(pattern = "^MT-", x = toupper(rownames(GetAssayData(object=seu_kera[[i]]))))]
  percent.mito <- Matrix::colSums(GetAssayData(object=seu_kera[[i]], slot="counts")[mito.genes, ]) / Matrix::colSums(GetAssayData(object=seu_kera[[i]], slot="counts"))
  seu_kera[[i]] <- AddMetaData(object = seu_kera[[i]], metadata = percent.mito, col.name = "prop.mt")
  
  # Restrict proportion MT expression to remove unhappy cells
  seu_kera[[i]] = subset(x = seu_kera[[i]], subset = prop.mt > 0.03 & prop.mt < 0.1)
  
  # Remove potential doublets
  seu[[i]] = subset(x = seu[[i]], subset = nFeature_RNA < 6500 & nCount_RNA < 55000)
  
  seu_kera[[i]] = SCTransform(seu_kera[[i]], verbose = FALSE, vars.to.regress=c("prop.mt", "nFeature_RNA", "nCount_RNA"))
  seu_kera[[i]] = CellCycleScoring(seu_kera[[i]], s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes, assay = 'SCT', set.ident = TRUE)
  # Enable this when also adjusting for cell cycle
  # seu[[i]] = SCTransform(seu[[i]], verbose = FALSE, vars.to.regress=c("prop.mt", "nFeature_RNA", "nCount_RNA", "S.Score", "G2M.Score"))
  
  seu[[i]] = AddMetaData(object=seu[[i]], metadata=rep(library_names[i], length(seu[[i]]$nCount_RNA)), col.name="library")
}
# This is required for PrepSCTIntegration, an increase from the R default 512 to 5120 for this dataset
options(future.globals.maxSize=5120*1024^2)

seu_features = SelectIntegrationFeatures(object.list = seu_kera, nfeatures = 3000)
seu_kera = PrepSCTIntegration(object.list = seu_kera, anchor.features = seu_features, verbose = FALSE)
seu_kera = lapply(X = seu_kera, FUN = RunPCA, verbose = FALSE, features = seu_features)
seu_anchors = FindIntegrationAnchors(object.list = seu_kera, normalization.method = "SCT", anchor.features = seu_features, verbose = FALSE, reduction = "rpca")

  |                                                  | 0 % ~calculating  
  |+++++++++                                         | 17% ~27s          
  |+++++++++++++++++                                 | 33% ~18s          
  |+++++++++++++++++++++++++                         | 50% ~14s          
  |++++++++++++++++++++++++++++++++++                | 67% ~10s          
  |++++++++++++++++++++++++++++++++++++++++++        | 83% ~05s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=33s  
seu_kera_integrated = IntegrateData(anchorset = seu_anchors, normalization.method = "SCT", verbose = FALSE)
library_per_cell = factor(unlist(lapply(seu, function(x) x@meta.data$library)), levels=library_names)
names(library_per_cell) = unlist(lapply(seu, function(x) rownames(x@meta.data)))
seu_kera_integrated = AddMetaData(object=seu_kera_integrated, metadata=library_per_cell, col.name="library")
seu_kera_integrated = RunPCA(seu_kera_integrated, verbose = FALSE)

VlnPlot(seu_kera_integrated, group.by="orig.ident", features = c("nFeature_RNA", "nCount_RNA", "prop.mt"), ncol = 3, pt.size=0.01)

ElbowPlot(seu_kera_integrated, ndims=50)

This time we pick 10 dimensions, the point where the tail flattens off

UMAPPlot(seu_kera_integrated)
p = UMAPPlot(seu_kera_integrated,group.by="library")
p = theme_publication_plot(p, "Library") + ggtitle(NULL) + xlab("UMAP 1") + ylab("UMAP 2")
print(p)

cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_umap_keratinocytes_overlay_library.pdf"),
                   base_height=4,
                   base_width=6)



UMAPPlot(seu_kera_integrated,group.by="Phase")


plot_dat = as.data.frame(seu_kera_integrated@reductions[["umap"]]@cell.embeddings)
plot_dat$krt14 = log(tpm["Krt14", rownames(seu_kera_integrated@meta.data)])
plot_dat$tgm3 = log(tpm["Tgm3", rownames(seu_kera_integrated@meta.data)])
plot_dat$krt4 = log(tpm["Krt4", rownames(seu_kera_integrated@meta.data)])
plot_dat$lor = log(tpm["Lor", rownames(seu_kera_integrated@meta.data)])

p1 = theme_publication_plot(ggplot(plot_dat) + aes(x=UMAP_1, y=UMAP_2, colour=krt14) + geom_point(size=0.2) + xlab("UMAP 1") + ylab("UMAP 2") + theme_cowplot() + ggtitle("Krt14"), legend_title = "log(TPM)")
p2 = theme_publication_plot(ggplot(plot_dat) + aes(x=UMAP_1, y=UMAP_2, colour=tgm3) + geom_point(size=0.2) + xlab("UMAP 1") + ylab("UMAP 2") + theme_cowplot() + ggtitle("Tgm3"), legend_title = "log(TPM)")
p3 = theme_publication_plot(ggplot(plot_dat) + aes(x=UMAP_1, y=UMAP_2, colour=krt4) + geom_point(size=0.2) + xlab("UMAP 1") + ylab("UMAP 2") + theme_cowplot() + ggtitle("Krt4"), legend_title = "log(TPM)")
p4 = theme_publication_plot(ggplot(plot_dat) + aes(x=UMAP_1, y=UMAP_2, colour=lor) + geom_point(size=0.2) + xlab("UMAP 1") + ylab("UMAP 2") + theme_cowplot() + ggtitle("Lor"), legend_title = "log(TPM)")
p = plot_grid(p1, p2, p3, p4, ncol=2)
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_keratinocytes_marker_expression.pdf"),
                   base_height=4,
                   base_width=6)


VlnPlot(seu_kera_integrated, group.by="orig.ident", features = "prop.mt", ncol = 1, pt.size=0.01)


FeaturePlot(seu_kera_integrated, features=c("nFeature_RNA", "nCount_RNA", "prop.mt"))


plot_dat = as.data.frame(seu_kera_integrated@reductions$umap@cell.embeddings)
plot_dat$phase = seu_kera_integrated@meta.data$Phase
plot_dat$phase = factor(plot_dat$phase, levels=c("G1", "G2M", "S"))
# mycolours = brewer.pal(4, "Dark2")
p = ggplot(plot_dat) + 
  aes(x=UMAP_1, y=UMAP_2, colour=phase) + 
  geom_point(size=0.2) + 
  xlab("UMAP 1") + ylab("UMAP 2")
p = theme_publication_plot(p, "Cell type")
print(p)

cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "figure5h.pdf"),
                   base_height=4,
                   base_width=6)


phase_count = seu_kera_integrated@meta.data %>% group_by(library, Phase) %>% summarise(n=n()) %>% mutate(frac = n/sum(n))
`summarise()` has grouped output by 'library'. You can override using the `.groups` argument.
phase_count$library = factor(phase_count$library, levels=c("WT 1", "WT 2", "HOM KO 1", "HOM KO 2"))
p = ggplot(phase_count) + 
  aes(x=library, y=frac, fill=Phase) + 
  geom_bar(position="dodge", stat="identity") + 
  xlab("Library") + ylab("Proportion of cells per library")
p = theme_publication_plot(p, "Cycle phase")
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "figure5i_alt1.pdf"),
                   base_height=4,
                   base_width=6)


p = ggplot(phase_count) + 
  aes(x=library, y=frac, fill=Phase) + 
  geom_bar(stat="identity", position = position_fill(reverse = TRUE)) + 
  xlab("Library") + ylab("Proportion of cells per library")
p = theme_publication_plot(p, "Cycle phase")
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "figure5i_alt2.pdf"),
                   base_height=4,
                   base_width=6)

Plot the cluster annotations in two ways - to match the heatmap further down.

p <- DimPlot(object = seu_kera_integrated, reduction = "umap")
p = LabelClusters(plot = p, id = 'ident', size=6)
p = p + xlab("UMAP 1") + ylab("UMAP 2")
p = theme_publication_plot(p, "Cluster") + ggtitle(NULL)
print(p)
# cowplot::save_plot(plot=p,
#                    filename = paste0(fileprefix, "cluster_annotations.pdf"),
#                    base_height=4,
#                    base_width=6)

seu_kera_integrated@meta.data$seurat_clusters_reordered = factor(as.numeric(as.character(seu_kera_integrated@meta.data$seurat_clusters)), levels=c(8,6,9,4,10,0,1,11,3,5,2,7))
p <- DimPlot(object = seu_kera_integrated, reduction = "umap", group.by = "seurat_clusters_reordered")
p = LabelClusters(plot = p, id = 'seurat_clusters_reordered', size=6)
p = p + xlab("UMAP 1") + ylab("UMAP 2")
p = theme_publication_plot(p, "Cluster") + ggtitle(NULL)
print(p)

# cowplot::save_plot(plot=p,
#                    filename = paste0(fileprefix, "cluster_annotations_alt.pdf"),
#                    base_height=4,
#                    base_width=6)
write.table(markers_keratinocytes, file=paste0(fileprefix, "signif_expr_markers_keratinocyte_analysis.txt"), quote=F, sep="\t", row.names=F)

Determine number of basal cells

# established to broadly capture the changeover point where Krt14 drops of and Tgm3 starts going up
slope = -1.5
intercept = -4.5

add_threshold = function(p, slope, intercept) {
  return(p + geom_abline(slope=slope, intercept=intercept, colour="black", linetype=2))
}

pc1 = as.data.frame(seu_kera_integrated@reductions[["umap"]]@cell.embeddings[, c(1, 2), drop=F])
pc1$library = as.character(seu_kera_integrated@meta.data$library[match(rownames(pc1), rownames(seu_kera_integrated@meta.data))])
pc1$selected = ifelse(pc1$UMAP_2 < (intercept+slope*pc1$UMAP_1), 0, 1)
pc1$selected = factor(pc1$selected, levels=c(1, 0))
pc1$layer = "Basal"
pc1$layer[pc1$selected=="0"] = "Suprabasal"
pc1$layer = factor(pc1$layer, levels=c("Basal", "Suprabasal"))

Plot the expression of a number of markers relative to the selected threshold to show we’ve broadly captured the crossover area

plot_dat = as.data.frame(seu_kera_integrated@reductions[["umap"]]@cell.embeddings)
plot_dat$krt14 = log(tpm["Krt14", rownames(seu_kera_integrated@meta.data)])
plot_dat$tgm3 = log(tpm["Tgm3", rownames(seu_kera_integrated@meta.data)])
plot_dat$krt4 = log(tpm["Krt4", rownames(seu_kera_integrated@meta.data)])
plot_dat$lor = log(tpm["Lor", rownames(seu_kera_integrated@meta.data)])

p1 = theme_publication_plot(ggplot(plot_dat) + aes(x=UMAP_1, y=UMAP_2, colour=krt14) + geom_point(size=0.2) + xlab("UMAP 1") + ylab("UMAP 2") + theme_cowplot() + ggtitle("Krt14"), legend_title = "log(TPM)")
p2 = theme_publication_plot(ggplot(plot_dat) + aes(x=UMAP_1, y=UMAP_2, colour=tgm3) + geom_point(size=0.2) + xlab("UMAP 1") + ylab("UMAP 2") + theme_cowplot() + ggtitle("Tgm3"), legend_title = "log(TPM)")
p3 = theme_publication_plot(ggplot(plot_dat) + aes(x=UMAP_1, y=UMAP_2, colour=krt4) + geom_point(size=0.2) + xlab("UMAP 1") + ylab("UMAP 2") + theme_cowplot() + ggtitle("Krt4"), legend_title = "log(TPM)")
p4 = theme_publication_plot(ggplot(plot_dat) + aes(x=UMAP_1, y=UMAP_2, colour=lor) + geom_point(size=0.2) + xlab("UMAP 1") + ylab("UMAP 2") + theme_cowplot() + ggtitle("Lor"), legend_title = "log(TPM)")
p = plot_grid(add_threshold(p1, slope, intercept), 
              add_threshold(p2, slope, intercept), 
              add_threshold(p3, slope, intercept), 
              add_threshold(p4, slope, intercept), ncol=2)
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_basal_cell_threshold_vs_markers.pdf"),
                   base_height=4,
                   base_width=6)

Plot the threshold per library to show which cells have been marked as basal cells

mycolours = c("red", "grey")
p1 = ggplot(pc1[pc1$library=="WT 1",]) +
  aes(x=UMAP_1, y=UMAP_2, colour=layer) +
  geom_point(size=0.2) +
  scale_colour_manual(values=mycolours, na.value="grey") +
  xlab("UMAP 1") + ylab("UMAP 2") + ggtitle("WT 1") + theme_cowplot() + theme(legend.position="none")

p2 = ggplot(pc1[pc1$library=="WT 2",]) +
  aes(x=UMAP_1, y=UMAP_2, colour=layer) +
  geom_point(size=0.2) +
  scale_colour_manual(values=mycolours, na.value="grey") +
  xlab("UMAP 1") + ylab("UMAP 2") + ggtitle("WT 2") + theme_cowplot() + theme(legend.position="none") 

p3 = ggplot(pc1[pc1$library=="HOM KO 1",]) +
  aes(x=UMAP_1, y=UMAP_2, colour=layer) +
  geom_point(size=0.2) +
  scale_colour_manual(values=mycolours, na.value="grey") +
  xlab("UMAP 1") + ylab("UMAP 2") + ggtitle("HOM KO 1") + theme_cowplot() + theme(legend.position="none")

p4 = ggplot(pc1[pc1$library=="HOM KO 2",]) +
  aes(x=UMAP_1, y=UMAP_2, colour=layer) +
  geom_point(size=0.2) +
  scale_colour_manual(values=mycolours, na.value="grey") +
  xlab("UMAP 1") + ylab("UMAP 2") + ggtitle("HOM KO 2") + theme_cowplot() + theme(legend.position="none")

p = plot_grid(add_threshold(p1, slope, intercept), 
              add_threshold(p2, slope, intercept), 
              add_threshold(p3, slope, intercept), 
              add_threshold(p4, slope, intercept))
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_basal_cell_classification_map.pdf"),
                   base_height=4,
                   base_width=6)

Summarise the calls into a table

basal_cell_count = pc1 %>% group_by(selected, library) %>% summarise(n=n()) %>% pivot_wider(names_from=selected, values_from=n)
`summarise()` has grouped output by 'selected'. You can override using the `.groups` argument.
colnames(basal_cell_count)[2:3] = c("num_basal", "num_suprabasal")
basal_cell_count[, c("frac_basal", "frac_suprabasal")] = basal_cell_count[, 2:3] / rowSums(basal_cell_count[, 2:3])
print(basal_cell_count)
write.table(basal_cell_count, file=paste0(fileprefix, "notch1_fraction_basal_cells_umap_threshold.txt"), quote=F, sep="\t", row.names=F)

Make the summary figures for the paper


mycolours = c("red", "grey")
p = ggplot(pc1) + 
  aes(x=UMAP_1, y=UMAP_2, colour=layer) + 
  geom_point(size=0.2) + 
  scale_colour_manual(values=mycolours, na.value="grey") +
  xlab("UMAP 1") + ylab("UMAP 2")
p = theme_publication_plot(p, "Layer")
print(p)

cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "figure5f.pdf"),
                   base_height=4,
                   base_width=6)


basal_cell_count = basal_cell_count[,c("library", "frac_basal", "frac_suprabasal")]
basal_cell_count$library = factor(basal_cell_count$library, levels=c("WT 1", "WT 2", "HOM KO 1", "HOM KO 2"))
colnames(basal_cell_count)[2:3] = c("Basal", "Suprabasal")
p = ggplot(basal_cell_count %>% pivot_longer(!library)) + 
  aes(x=library, y=value, fill=name) + 
  geom_bar(position="dodge", stat="identity") + 
  scale_fill_manual(values=mycolours, na.value="grey") +
  xlab("Library") + ylab("Proportion of cells per library")
p = theme_publication_plot(p, "Layer")
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "figure5g_alt1.pdf"),
                   base_height=4,
                   base_width=6)


p = ggplot(basal_cell_count %>% pivot_longer(!library)) + 
  aes(x=library, y=value, fill=name) +
  geom_bar(stat="identity", position = position_fill(reverse = TRUE)) + 
  scale_fill_manual(values=mycolours, na.value="grey") +
  xlab("Library") + ylab("Proportion of cells per library")
p = theme_publication_plot(p, "Layer")
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "figure5g_alt2.pdf"),
                   base_height=4,
                   base_width=6)

Save the outcome

# save the Krt4 estimates
metadat = seu_kera_integrated@meta.data
metadat$cell = rownames(metadat)

metadat = metadat[, c("cell", "orig.ident", "Phase", "prop.mt", "nCount_RNA", "nFeature_RNA")]
colnames(metadat)[2] = "library"

metadat$is_basal = pc1$layer=="Basal"
write.table(metadat, file=paste0(fileprefix, "notch1_basal_suprabasal_calls_umap_threshold.txt"), quote=F, sep="\t", row.names=F)

Plot percentage basal cells per cell cycle phase

phase_count = seu_kera_integrated@meta.data[metadat$is_basal,] %>% group_by(library, Phase) %>% summarise(n=n()) %>% mutate(frac = n/sum(n))
`summarise()` has grouped output by 'library'. You can override using the `.groups` argument.
phase_count$library = factor(phase_count$library, levels=c("WT 1", "WT 2", "HOM KO 1", "HOM KO 2"))

p = ggplot(phase_count) + 
  aes(x=library, y=frac, fill=Phase) + 
  geom_bar(stat="identity", position = position_fill(reverse = TRUE)) + 
  xlab("Library") + ylab("Proportion of cells per library")
p = theme_publication_plot(p, "Cycle phase")
print(p)


p = ggplot(phase_count) + 
  aes(x=library, y=n, fill=Phase) + 
  geom_bar(stat="identity", position = "dodge") + 
  xlab("Library") + ylab("Number of cells per library")
p = theme_publication_plot(p, "Cycle phase")
print(p)

save(file=paste0(fileprefix, "notch1_keratinocytes_batch_effect_with_filter_with_adjustment.RData"), seu_kera_integrated, markers_keratinocytes, cluster_lib_count)

Additional plots

Downsampled plot showing equal cells per library

counts_per_library = table(seu_integrated@meta.data$library)
set.seed(123)
selected_cells = rownames(seu_integrated@meta.data)[seu_integrated@meta.data$library=="WT 1"][sample(1:counts_per_library["WT 1"], 1500)]

selected_cells = c(selected_cells, rownames(seu_integrated@meta.data)[seu_integrated@meta.data$library=="WT 2"][sample(1:counts_per_library["WT 2"], 1500)])

selected_cells = c(selected_cells, rownames(seu_integrated@meta.data)[seu_integrated@meta.data$library=="HOM KO 1"][sample(1:counts_per_library["HOM KO 1"], 1500)])

selected_cells = c(selected_cells, rownames(seu_integrated@meta.data)[seu_integrated@meta.data$library=="HOM KO 2"][sample(1:counts_per_library["HOM KO 2"], 1500)])



seu_integrated_temp = subset(seu_integrated, cells=selected_cells)

p = UMAPPlot(seu_integrated_temp,group.by="library")
p = theme_publication_plot(p, "Library") + ggtitle(NULL) + xlab("UMAP 1") + ylab("UMAP 2")
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_umap_overlay_library_downsampled1500.pdf"),
                   base_height=4,
                   base_width=6)


counts_per_library = table(seu_kera_integrated@meta.data$library)
set.seed(123)
selected_cells = rownames(seu_kera_integrated@meta.data)[seu_kera_integrated@meta.data$library=="WT 1"][sample(1:counts_per_library["WT 1"], 1400)]

selected_cells = c(selected_cells, rownames(seu_kera_integrated@meta.data)[seu_kera_integrated@meta.data$library=="WT 2"][sample(1:counts_per_library["WT 2"], 1400)])

selected_cells = c(selected_cells, rownames(seu_kera_integrated@meta.data)[seu_kera_integrated@meta.data$library=="HOM KO 1"][sample(1:counts_per_library["HOM KO 1"], 1400)])

selected_cells = c(selected_cells, rownames(seu_kera_integrated@meta.data)[seu_kera_integrated@meta.data$library=="HOM KO 2"][sample(1:counts_per_library["HOM KO 2"], 1400)])


seu_integrated_temp = subset(seu_kera_integrated, cells=selected_cells)

p = UMAPPlot(seu_integrated_temp,group.by="library")
p = theme_publication_plot(p, "Library") + ggtitle(NULL) + xlab("UMAP 1") + ylab("UMAP 2")
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_umap_overlay_library_keratinocytes_downsampled1400.pdf"),
                   base_height=4,
                   base_width=6)

# load("20210717_notch1_batch_effect_with_filter_with_adjustment.RData")

Plot heatmap with selected marker genes from McGin et al expression per cluster

load(paste0(fileprefix, "notch1_keratinocytes_batch_effect_with_filter_with_adjustment.RData"))
basal_markers = c("Cdh3", "Itgb1", "Krt15", "Krt14", "Krt5", "Col17a1", "Sox2", "Trp63")
cycle_markers = c("Gmnn", "Mcm6", "Mcm2", "Cdt1", "Pcna", "Ccne1", "E2f1", "Ccne2", "Cdc6", "Aurkb", "Top2a", "Ccnb2", "Bub1", "Ube2c", "Aurka", "Kif23", "Ccnb1", "Mki67", "Mad2l1", "Birc5")
diff_markers = c("Krt13", "Klf4", "Tgm3", "Sbsn", "Grhl3", "Krt4", "Notch3")
seu_kera_integrated@meta.data$seurat_clusters_reordered = factor(as.numeric(as.character(seu_kera_integrated@meta.data$seurat_clusters)), levels=c(8,6,9,4,10,0,1,11,3,5,2,7))
# p = DoHeatmap(seu_kera_integrated, features = c(basal_markers, cycle_markers, diff_markers), group.by="seurat_clusters_reordered") + NoLegend()
# print(p)
# cowplot::save_plot(plot=p,
#                     filename = "figure_heatmap_all_libraries.pdf",
#                     base_height=4,
#                     base_width=8)
p = DoHeatmap(seu_kera_integrated, 
          cells = rownames(seu_kera_integrated@meta.data)[seu_kera_integrated@meta.data$orig.ident %in% c("c1", "c2")],
          features = c(basal_markers, cycle_markers, diff_markers), group.by="seurat_clusters_reordered") + NoLegend()
Warning in DoHeatmap(seu_kera_integrated, cells = rownames(seu_kera_integrated@meta.data)[seu_kera_integrated@meta.data$orig.ident %in%  :
  The following features were omitted as they were not found in the scale.data slot for the integrated assay: Cdc6
print(p)
cowplot::save_plot(plot=p,
                    filename = "figure_heatmap_control_libraries.pdf",
                    base_height=4,
                    base_width=8)

p = DoHeatmap(seu_kera_integrated, 
          cells = rownames(seu_kera_integrated@meta.data)[seu_kera_integrated@meta.data$orig.ident %in% c("s1", "s2")],
          features = c(basal_markers, cycle_markers, diff_markers), group.by="seurat_clusters_reordered") + NoLegend()
Warning in DoHeatmap(seu_kera_integrated, cells = rownames(seu_kera_integrated@meta.data)[seu_kera_integrated@meta.data$orig.ident %in%  :
  The following features were omitted as they were not found in the scale.data slot for the integrated assay: Cdc6
print(p)
cowplot::save_plot(plot=p,
                    filename = "figure_heatmap_ko_libraries.pdf",
                    base_height=4,
                    base_width=8)

Save a table with counts per cluster per library

cluster_counts = list()
for (cluster in unique(seu_kera_integrated@meta.data$seurat_clusters)) {
  res = seu_kera_integrated@meta.data[seu_kera_integrated@meta.data$seurat_clusters==cluster,] %>% group_by(library) %>% summarise(n=n())
  res$cluster = cluster
  cluster_counts[[cluster]] = res
}
cluster_counts = do.call(rbind, cluster_counts)
write.table(cluster_counts, file=paste0(fileprefix, "_cluster_library_cell_counts.txt"), quote=F, sep="\t", row.names=F)
---
title: "Notch1 - analysis v1.2"
output: 
  html_document:
    toc: yes
  html_notebook:
    theme: united
    toc: yes
---

# Functions and imports
```{r, message=FALSE, warning=FALSE, class.source = 'fold-hide'}
fileprefix = "20210717_"

ref_file_mm10 = "~/Documents/reference_data/mouse/ensembl/ensembl_gene_length_mm10.txt.gz"

# This is required to set the fonts of the paper plots to Arial
library(extrafont)
font_import(prompt=FALSE)
loadfonts()

library(RColorBrewer)
library(Seurat)
library(scater)
library(cowplot)
library(dplyr)
library(tidyr)
library(SoupX)

calc_tpm = function(dat, ref_file) {
  mm_annot = read.table(ref_file, header=T, stringsAsFactors=F, sep="\t")
  eff_length = mm_annot$eff_length[match(rownames(dat), mm_annot$mgi_symbol)]
  eff_length[!is.finite(eff_length)] = 1
  
  sce = SingleCellExperiment(assays=list(counts=as.matrix(dat), logcounts=log2(as.matrix(dat)+1)))
  tpm = calculateTPM(sce, eff_length)
  return(tpm)
}

theme_publication_plot = function(p, legend_title, legend_aes=4) {
  p = p +
  theme_cowplot() + 
  theme(axis.text = element_text(family="Arial", colour="black",size=10,face="plain"),
                       axis.title = element_text(family="Arial", colour="black",size=12,face="plain"),
                       strip.text.x = element_text(family="Arial", colour="black",size=12,face="plain", angle=90),
                       strip.text.y = element_text(family="Arial", colour="black",size=12,face="plain", angle=360),
                       strip.background = element_blank(),
                       panel.spacing.x = unit(1, "mm"),
                       panel.spacing.y = unit(3, "mm"),
                       plot.title = element_text(family="Arial", colour="black",size=12,face="plain",hjust = 0.5))
  if (!is.null(legend_title)) {
     p = p + guides(fill = guide_legend(title=legend_title,
                              override.aes = list(size=legend_aes)),
                            colour = guide_legend(title=legend_title,
                              override.aes = list(size=legend_aes)))
  }
  return(p)
}
```

# Load the data

```{r, message=FALSE, warning=FALSE}
samplenames = c("EA_WT_1", "EA_WT_2", "EA_NOTCH1_HOM_1", "EA_NOTCH1_HOM_2")
library_names = c("WT 1", "WT 2", "HOM KO 1", "HOM KO 2")
prefixes = c("c1_", "c2_", "s1_", "s2_")
expr_matrices = list()
# celltypes_all = list()
for (i in 1:length(samplenames)) {
  prefix = prefixes[i]
  in_path = file.path("../cellranger/", samplenames[i])
  sc = load10X(in_path)
  
  out = sc$toc
  colnames(out) = paste0(prefix, colnames(out))

  expr_matrices[[length(expr_matrices)+1]] = out
  
  # This loads a previous set of cell type assignments that was used to compare the new setup
  # celltypes = read.table(file.path("../identify_celltypes", samplenames[i], "celltypes_per_cell.txt"), header=T, stringsAsFactors=F)
  # celltypes$cell = paste0(prefix, celltypes$cell)
  # celltypes_all[[length(celltypes_all)+1]] = celltypes
}
```

# Read in some CellRanger stats
```{r, warning = FALSE, message = FALSE}
stats = list()
for (i in 1:length(samplenames)) {
  stats_sample = read.table(file.path("../cellranger/", samplenames[i], "metrics_summary.csv"), sep=",", header=T, stringsAsFactors=F)
  stats_sample$library = samplenames[i]
  stats[[samplenames[i]]] = stats_sample
}
stats = do.call(rbind, stats)
print(stats)
```


# Explore to identify filters

Here we plot a summary of general QC metrics to identify unhappy cells to be filtered
```{r, warning = FALSE, message = FALSE}
nFeature_RNA_min = 2500
nFeature_RNA_max = 6500
nCount_RNA_max = 55000
prop.mt_min = 0.03
prop.mt_max = 0.1

seu = list()
for (i in 1:length(expr_matrices)) {
  seu[[i]] = CreateSeuratObject(counts = expr_matrices[[i]], min.cells = 0, min.features = 0)
  
  mito.genes = rownames(GetAssayData(object=seu[[i]]))[grepl(pattern = "^MT-", x = toupper(rownames(GetAssayData(object=seu[[i]]))))]
  percent.mito = Matrix::colSums(GetAssayData(object=seu[[i]], slot="counts")[mito.genes, ]) / Matrix::colSums(GetAssayData(object=seu[[i]], slot="counts"))
  seu[[i]] = AddMetaData(object = seu[[i]], metadata = percent.mito, col.name = "prop.mt")
  
  # plot prop.mt vs nFeature_RNA and vs nCount_RNA
  p1 = ggplot(seu[[i]]@meta.data) + aes(x=prop.mt, y=nFeature_RNA) + geom_point(size=0.1, colour="red") + scale_y_log10() + theme_cowplot() + geom_vline(xintercept=c(prop.mt_min, prop.mt_max), linetype=2) + geom_hline(yintercept=c(nFeature_RNA_min, nFeature_RNA_max), linetype=2)
  
  p2 = ggplot(seu[[i]]@meta.data) + aes(x=prop.mt, y=nCount_RNA) + geom_point(size=0.1, colour="blue") + scale_y_log10() + theme_cowplot() + geom_vline(xintercept=c(prop.mt_min, prop.mt_max), linetype=2) + geom_hline(yintercept=c(nCount_RNA_max), linetype=2)
  
  print(plot_grid(p1, p2))
  
  p1 = VlnPlot(seu[[i]], group.by="orig.ident", features = c( "nFeature_RNA"), ncol = 1, pt.size=0) +
    geom_hline(yintercept = c(nFeature_RNA_min, nFeature_RNA_max), linetype=2) + theme(legend.position="none")
  p2 = VlnPlot(seu[[i]], group.by="orig.ident", features = c( "nCount_RNA"), ncol = 1, pt.size=0) +
    geom_hline(yintercept = c(nCount_RNA_max), linetype=2) + theme(legend.position="none")
  p3 = VlnPlot(seu[[i]], group.by="orig.ident", features = c( "prop.mt"), ncol = 1, pt.size=0) + theme(legend.position="none") + geom_hline(yintercept = c(prop.mt_max, prop.mt_min), linetype=2)
  
  # if (grepl("KO", samplenames[i], fixed=T)) {
  #   p3 = p3 + geom_hline(yintercept = c(0.1, 0.002), linetype=2)
  # } else {
  #   p3 = p3 + geom_hline(yintercept = c(0.1, 0.01), linetype=2)
  # }
  
  print(plot_grid(p1, p2, p3, ncol=3))
}
```

# All cells
## Apply batch correction - MT filters and adjust for MT, nGenes and cell cycle
```{r, warning = FALSE, message = FALSE}
seu = list()
for (i in 1:length(expr_matrices)) {
  seu[[i]] = CreateSeuratObject(counts = expr_matrices[[i]], min.cells = 30, min.features = 2500)
  
  mito.genes = rownames(GetAssayData(object=seu[[i]]))[grepl(pattern = "^MT-", x = toupper(rownames(GetAssayData(object=seu[[i]]))))]
  percent.mito = Matrix::colSums(GetAssayData(object=seu[[i]], slot="counts")[mito.genes, ]) / Matrix::colSums(GetAssayData(object=seu[[i]], slot="counts"))
  seu[[i]] = AddMetaData(object = seu[[i]], metadata = percent.mito, col.name = "prop.mt")
  
  # Restrict proportion MT expression to remove unhappy cells
  seu[[i]] = subset(x = seu[[i]], subset = prop.mt > 0.03 & prop.mt < 0.1)
  
  # Remove potential doublets
  seu[[i]] = subset(x = seu[[i]], subset = nFeature_RNA < 6500 & nCount_RNA < 55000)
  
  seu[[i]] = SCTransform(seu[[i]], verbose = FALSE, vars.to.regress=c("prop.mt", "nFeature_RNA", "nCount_RNA"))
  seu[[i]] = CellCycleScoring(seu[[i]], s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes, assay = 'SCT', set.ident = TRUE)
  # Enable this when also adjusting for cell cycle
  seu[[i]] = SCTransform(seu[[i]], verbose = FALSE, vars.to.regress=c("prop.mt", "nFeature_RNA", "nCount_RNA", "S.Score", "G2M.Score"))
  
  seu[[i]] = AddMetaData(object=seu[[i]], metadata=factor(rep(library_names[i], length(seu[[i]]$nCount_RNA)), levels=library_names), col.name="library")
}
```

```{r, warning = FALSE, message = FALSE}
# This is required for PrepSCTIntegration, an increase from the R default 512 to 5120 for this dataset
options(future.globals.maxSize=5120*1024^2)

seu_features = SelectIntegrationFeatures(object.list = seu, nfeatures = 3000)
seu = PrepSCTIntegration(object.list = seu, anchor.features = seu_features, verbose = FALSE)
seu = lapply(X = seu, FUN = RunPCA, verbose = FALSE, features = seu_features)
seu_anchors = FindIntegrationAnchors(object.list = seu, normalization.method = "SCT", anchor.features = seu_features, verbose = FALSE, reduction = "rpca")
seu_integrated = IntegrateData(anchorset = seu_anchors, normalization.method = "SCT", verbose = FALSE)
seu_integrated@meta.data$library = factor(seu_integrated@meta.data$library, levels=library_names)
```

```{r}
seu_integrated = RunPCA(seu_integrated, verbose = FALSE)

VlnPlot(seu_integrated, group.by="orig.ident", features = c("nFeature_RNA", "nCount_RNA", "percent.mito"), ncol = 3, pt.size=0.01)
ElbowPlot(seu_integrated, ndims=50)
```
At 30 dimensions the tail is very flat already
```{r}
seu_integrated = RunUMAP(seu_integrated, dims = 1:30)
UMAPPlot(seu_integrated)
p = UMAPPlot(seu_integrated,group.by="library")
p = theme_publication_plot(p, "Library")
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_umap_overlay_library.pdf"),
                   base_height=4,
                   base_width=6)

FeaturePlot(seu_integrated, features=c("Krt14", "Tgm3", "Krt4", "Lor"))

VlnPlot(seu_integrated, group.by="orig.ident", features = "prop.mt", ncol = 1, pt.size=0.01)

FeaturePlot(seu_integrated, features=c("nFeature_RNA", "nCount_RNA", "prop.mt"))
```


### Assign a celltype to all cells

Here we consider a number of markers with a threshold each to establish celltypes
```{r}
# celltypes = do.call(rbind, celltypes_all)
tpm = calc_tpm(seu_integrated@assays[["RNA"]]@counts, ref_file_mm10)

# get tpm for marker genes per cluster
seu_integrated = FindNeighbors(seu_integrated, dims = 1:30)
seu_integrated = FindClusters(object = seu_integrated, resolution = 0.6, verbose=FALSE)
markers_all_cells = FindAllMarkers(seu_integrated, only.pos = TRUE, min.pct = 0.1, thresh.use = 0.25, verbose = F)

mdata = seu_integrated@meta.data
# Overlay a previous set of cell type annotations to compare
# mdata$celltype = NA
# inboth = intersect(gsub("-1", "", rownames(mdata)), celltypes$cell)
# mdata$celltype[match(gsub("-1", "", rownames(mdata)), inboth)] = celltypes$classification[match(inboth, celltypes$cell)]
# seu_integrated = AddMetaData(seu_integrated, mdata$celltype, "celltype")

# UMAPPlot(seu_integrated, group.by="celltype")

cluster_classification = data.frame(clusterid=sort(unique(mdata$seurat_clusters)))

make_plot = function(mdata, tpm, gene_name) {
  plot_dat = data.frame(cell=rownames(mdata), cluster=mdata$seurat_clusters, tpm=tpm[gene_name,rownames(mdata)])
  plot_dat = plot_dat[is.finite(plot_dat$tpm),]
  plot_dat_clusters  = plot_dat %>% group_by(cluster) %>% summarise(n=n(), mean=mean(tpm), median=median(tpm))
  p2 = ggplot(plot_dat_clusters) + aes(x=cluster, y=median, label=cluster) + geom_text() + theme_cowplot() + ggtitle(gene_name)
  return(list(p=p2, plot_dat_clusters=plot_dat_clusters))
}

make_umap_plot = function(plot_dat, gene_name) {
  plot_dat[, gene_name] = log(plot_dat[, gene_name])
  gene_name_title = paste0(toupper(substr(gene_name, 1, 1)), substr(gene_name, 2, nchar(gene_name)))
  return(ggplot(plot_dat) + aes_string(x="UMAP_1", y="UMAP_2", colour=gene_name) + 
           geom_point(size=0.25) + scale_colour_gradient(low="grey", high="red") +
           theme_cowplot() + ggtitle(gene_name_title))
}

plot_data = as.data.frame(seu_integrated@reductions$umap@cell.embeddings)

# Keratinocytes
p1 = make_plot(mdata, tpm, "Krt14")
cluster_classification$krt14 = p1$plot_dat_clusters$median > 50
plot_data$krt14 = tpm["Krt14", rownames(plot_data)]

p2 = make_plot(mdata, tpm, "Tgm3")
cluster_classification$tgm3 = p2$plot_dat_clusters$median > 5
plot_data$tgm3 = tpm["Tgm3", rownames(plot_data)]

p3 = make_plot(mdata, tpm, "Lor")
cluster_classification$lor = p3$plot_dat_clusters$median > 50
plot_data$lor = tpm["Lor", rownames(plot_data)]

p4 = make_plot(mdata, tpm, "Krt4")
plot_data$krt4 = tpm["Krt4", rownames(plot_data)]

plot_grid(plotlist=list(p1$p, p2$p, p3$p, p4$p), ncol=2)
p = plot_grid(plotlist=list(theme_publication_plot(make_umap_plot(plot_data, "krt14"), "log(TPM)"),
                        theme_publication_plot(make_umap_plot(plot_data, "tgm3"), "log(TPM)"),
                        theme_publication_plot(make_umap_plot(plot_data, "lor"), "log(TPM)"),
                        theme_publication_plot(make_umap_plot(plot_data, "krt4"), "log(TPM)")), ncol=2, nrow=2)
print(p)
plot_grid(plotlist=list(make_umap_plot(plot_data, "krt14"),
                        make_umap_plot(plot_data, "krt4")), ncol=2, nrow=2)
plot_grid(plotlist=list(make_umap_plot(plot_data, "krt14"),
                        make_umap_plot(plot_data, "tgm3")), ncol=2, nrow=2)

cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_markers_keratinocyte.pdf"),
                   base_height=4,
                   base_width=6)


# Fibroblasts and endothelial cells

p1 = make_plot(mdata, tpm, "Col1a1")
cluster_classification$col1a1 = p1$plot_dat_clusters$median > 10
plot_data$col1a1 = tpm["Col1a1", rownames(plot_data)]

p2 = make_plot(mdata, tpm, "Pecam1")
cluster_classification$pecam1 = p2$plot_dat_clusters$median > 2
plot_data$pcam1 = tpm["Pecam1", rownames(plot_data)]

plot_grid(plotlist=list(p1$p, p2$p), ncol=2)
p = plot_grid(plotlist=list(theme_publication_plot(make_umap_plot(plot_data, "col1a1"), "log(TPM)"),
                        theme_publication_plot(make_umap_plot(plot_data, "pcam1"), "log(TPM)")), ncol=2, nrow=2)
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_markers_fibroblast_endothelial.pdf"),
                   base_height=4,
                   base_width=6)


# Immune cells

# B-cell marker
p1 = make_plot(mdata, tpm, "Cd83")
cluster_classification$cd83 = p1$plot_dat_clusters$median > 1
plot_data$cd83 = tpm["Cd83", rownames(plot_data)]

p2 = make_plot(mdata, tpm, "Cd84")
cluster_classification$cd84 = p2$plot_dat_clusters$median > 1
plot_data$cd84 = tpm["Cd84", rownames(plot_data)]

p3 = make_plot(mdata, tpm, "Cd86")
cluster_classification$cd86 = p3$plot_dat_clusters$median > 0.4
plot_data$cd86 = tpm["Cd86", rownames(plot_data)]

plot_grid(plotlist=list(p1$p, p2$p, p3$p), ncol=2)
plot_grid(plotlist=list(make_umap_plot(plot_data, "cd83"),
                        make_umap_plot(plot_data, "cd84"),
                        make_umap_plot(plot_data, "cd86")), ncol=2, nrow=2)

# t-cell marker
p1 = make_plot(mdata, tpm, "Trbc2")
cluster_classification$trbc2 = p1$plot_dat_clusters$median > 50
plot_data$trbc2 = tpm["Trbc2", rownames(plot_data)]

# not specific
p2 = make_plot(mdata, tpm, "Cd52")
cluster_classification$cd52 = p2$plot_dat_clusters$median > 50
plot_data$cd52 = tpm["Cd52", rownames(plot_data)]

# t-cell marker
p3 = make_plot(mdata, tpm, "Ptprc")
cluster_classification$ptprc = p3$plot_dat_clusters$median > 0.25
plot_data$ptprc = tpm["Ptprc", rownames(plot_data)]

plot_grid(plotlist=list(p1$p, p2$p, p3$p), ncol=2)
p = plot_grid(plotlist=list(theme_publication_plot(make_umap_plot(plot_data, "ptprc"), "log(TPM)"),
                        theme_publication_plot(make_umap_plot(plot_data, "cd52"), "log(TPM)")), ncol=2, nrow=2)
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_markers_immune.pdf"),
                   base_height=4,
                   base_width=6)
```

```{r}
plot <- DimPlot(object = seu_integrated, reduction = "umap")
LabelClusters(plot = plot, id = 'ident', size=6)
```

Now lets assign a celltype using the marker thresholds established above
```{r}
cluster_classification$celltype = NA
cluster_classification$celltype[cluster_classification$krt14 | cluster_classification$tgm3 | cluster_classification$lor] = "keratinocyte"
cluster_classification$celltype[cluster_classification$col1a1] = "fibroblast"
cluster_classification$celltype[cluster_classification$pecam1] = "endothelial"
cluster_classification$celltype[cluster_classification$cd83 | cluster_classification$cd84 | cluster_classification$cd86 | cluster_classification$cd52 | cluster_classification$ptprc] = "immune"
cluster_classification_table = cluster_classification$celltype
names(cluster_classification_table) = as.character(cluster_classification$clusterid)
seu_integrated = AddMetaData(seu_integrated, cluster_classification_table[as.character(seu_integrated@meta.data$seurat_clusters)], "celltype")
UMAPPlot(seu_integrated, group.by="celltype")

plot_dat = as.data.frame(seu_integrated@reductions[["umap"]]@cell.embeddings)
plot_dat$celltype = seu_integrated@meta.data$celltype
plot_dat$celltype = unlist(lapply(plot_dat$celltype, function(x) { paste(toupper(substr(x, 1, 1)), substr(x, 2, nchar(x)), sep="") }))
plot_dat$celltype_fctr = factor(plot_dat$celltype, levels=c("Keratinocyte", "Fibroblast", "Immune", "Endothelial"))
```

A number of cells appear to be assigned to the wrong cluster, based on the UMAP space, set these to NA, just to be sure.
```{r}
plot_data = as.data.frame(seu_integrated@reductions$umap@cell.embeddings)
plot_data$celltype = seu_integrated@meta.data$celltype
plot_data = plot_data[plot_data$celltype=="keratinocyte",]
plot_data$selection = plot_data$UMAP_1 < -5 | plot_data$UMAP_2 > 5
plot_data$selection_fctr = factor(plot_data$selection, levels=c(TRUE, FALSE))

mycolours = c("red", "grey")
names(mycolours) = c(TRUE, FALSE)

p = ggplot(plot_data) + aes_string(x="UMAP_1", y="UMAP_2", colour="selection_fctr") + 
           geom_point(size=0.25) + scale_colour_manual(values=mycolours) +
           theme_cowplot()
print(p)
seu_integrated@meta.data$celltype[rownames(seu_integrated@meta.data) %in% rownames(plot_data)[plot_data$selection]] = NA
UMAPPlot(seu_integrated, group.by="Phase")

plot_data = as.data.frame(seu_integrated@reductions$umap@cell.embeddings)
plot_data$celltype = seu_integrated@meta.data$celltype
plot_data$celltype = unlist(lapply(plot_data$celltype, function(x) { paste(toupper(substr(x, 1, 1)), substr(x, 2, nchar(x)), sep="") }))
plot_data$celltype_fctr = factor(plot_data$celltype, levels=c("Keratinocyte", "Fibroblast", "Immune", "Endothelial"))
mycolours = brewer.pal(4, "Dark2")
p = ggplot(plot_data) + 
  aes(x=UMAP_1, y=UMAP_2, colour=celltype_fctr) + 
  geom_point(size=0.2) + 
  scale_colour_manual(values=mycolours, na.value="grey") +
  xlab("UMAP 1") + ylab("UMAP 2")
p = theme_publication_plot(p, "Cell type")
print(p)

cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "figure5d.pdf"),
                   base_height=4,
                   base_width=6)
```
Plot Notch1 expression per library
```{r}
notch1_expr = data.frame(expression=tpm["Notch1", rownames(seu_integrated@meta.data)], library=seu_integrated@meta.data$library)
notch1_expr$library = factor(notch1_expr$library, levels=c("WT 1", "WT 2", "HOM KO 1", "HOM KO 2"))
p = ggplot(notch1_expr) + aes(x=library, y=expression, fill=library) + geom_boxplot() + 
  theme_cowplot() + xlab("Library") + ylab("TPM")
p = theme_publication_plot(p, "Library", legend_aes = 1)
print(p)

cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_notch1_expression_all_cells.pdf"),
                   base_height=4,
                   base_width=6)
```
## Save
```{r}
save(file=paste0(fileprefix, "notch1_batch_effect_with_filter_with_adjustment.RData"), seu_integrated, markers_all_cells)
metdata = seu_integrated@meta.data
metdata$cell = rownames(metdata)
write.table(metdata, file=paste0(fileprefix, "notch1_celltypes_and_metadata.txt"), quote=F, sep="\t", row.names=F)
```

## Cell type counts per library

```{r}
celltype_count = seu_integrated@meta.data[, c("orig.ident", "celltype")] %>% group_by(orig.ident, celltype) %>% summarise(n=n()) %>% spread(celltype, n)
celltype_frac = celltype_count[, 2:ncol(celltype_count)] / rowSums(celltype_count[, 2:ncol(celltype_count)], na.rm=T)
celltype_frac = cbind(data.frame(library=c("WT 1", "WT 2", "HOM KO 1", "HOM KO 2")), celltype_frac)
celltype_frac
write.table(celltype_frac, file=paste0(fileprefix, "notch1_cell_type_counts_per_library.txt"), quote=F, sep="\t", row.names=F)


celltype_frac$library = factor(celltype_frac$library, levels=c("WT 1", "WT 2", "HOM KO 1", "HOM KO 2"))
cell_type_frac_wider = celltype_frac %>% pivot_longer(!library)
cell_type_frac_wider$name = unlist(lapply(cell_type_frac_wider$name, function(x) { paste(toupper(substr(x, 1, 1)), substr(x, 2, nchar(x)), sep="") }))
cell_type_frac_wider$name = factor(cell_type_frac_wider$name, levels=levels(plot_dat$celltype_fctr))
p = ggplot(cell_type_frac_wider) + 
  aes(x=library, y=value, fill=name) + 
  geom_bar(position="dodge", stat="identity") + 
  scale_fill_manual(values=mycolours, na.value="grey") +
  xlab("Library") + ylab("Proportion of cells per library")
p = theme_publication_plot(p, "Cell type")
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "figure5e_alt1.pdf"),
                   base_height=4,
                   base_width=6)

p = ggplot(cell_type_frac_wider) + 
  aes(x=library, y=value, fill=name) + 
  geom_bar(stat="identity", position = position_fill(reverse = TRUE)) + 
  scale_fill_manual(values=mycolours, na.value="grey") +
  xlab("Library") + ylab("Proportion of cells per library")
p = theme_publication_plot(p, "Cell type")
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "figure5e_alt2.pdf"),
                   base_height=4,
                   base_width=6)
```

# Analysis keratinocytes

Rerun integration on keratinocytes only. Here we do not perform cell cycle adjustment, as the cell cycle is deeply connected to the tissue dynamics we're interested in
```{r, warning = FALSE, message = FALSE}
keratinocyte_barcodes = rownames(seu_integrated@meta.data)[seu_integrated@meta.data$celltype=="keratinocyte" & !is.na(seu_integrated@meta.data$celltype)]

# Initial run found these cells as an outlier cluster and expressing fibroblast markers. Here these are also removed
misidentified_fibroblasts = read.table("20201211_fibroblast_identified_as_keratinocyte.txt", header=T, stringsAsFactors=F)
keratinocyte_barcodes = keratinocyte_barcodes[keratinocyte_barcodes %in% misidentified_fibroblasts$cell[!misidentified_fibroblasts$fibroblast_identified_as_keratinocyte]]

outlier_cells = read.table("20201211_notch1_cluster_assignments_to_remove_outlier_cells.txt", header=T, stringsAsFactors=F)
outlier_cells = outlier_cells[outlier_cells$is_outlier,]
keratinocyte_barcodes = keratinocyte_barcodes[!keratinocyte_barcodes %in% outlier_cells$cell]

seu_kera = list()
for (i in 1:length(expr_matrices)) {
  seu_kera[[i]] = CreateSeuratObject(counts = expr_matrices[[i]][, colnames(expr_matrices[[i]]) %in% keratinocyte_barcodes], min.cells = 30, min.features = 3000)
  
  mito.genes <- rownames(GetAssayData(object=seu_kera[[i]]))[grepl(pattern = "^MT-", x = toupper(rownames(GetAssayData(object=seu_kera[[i]]))))]
  percent.mito <- Matrix::colSums(GetAssayData(object=seu_kera[[i]], slot="counts")[mito.genes, ]) / Matrix::colSums(GetAssayData(object=seu_kera[[i]], slot="counts"))
  seu_kera[[i]] <- AddMetaData(object = seu_kera[[i]], metadata = percent.mito, col.name = "prop.mt")
  
  # Restrict proportion MT expression to remove unhappy cells
  seu_kera[[i]] = subset(x = seu_kera[[i]], subset = prop.mt > 0.03 & prop.mt < 0.1)
  
  # Remove potential doublets
  seu[[i]] = subset(x = seu[[i]], subset = nFeature_RNA < 6500 & nCount_RNA < 55000)
  
  seu_kera[[i]] = SCTransform(seu_kera[[i]], verbose = FALSE, vars.to.regress=c("prop.mt", "nFeature_RNA", "nCount_RNA"))
  seu_kera[[i]] = CellCycleScoring(seu_kera[[i]], s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes, assay = 'SCT', set.ident = TRUE)
  # Enable this when also adjusting for cell cycle
  # seu[[i]] = SCTransform(seu[[i]], verbose = FALSE, vars.to.regress=c("prop.mt", "nFeature_RNA", "nCount_RNA", "S.Score", "G2M.Score"))
  
  seu[[i]] = AddMetaData(object=seu[[i]], metadata=rep(library_names[i], length(seu[[i]]$nCount_RNA)), col.name="library")
}
```

```{r, warning = FALSE, message = FALSE}
# This is required for PrepSCTIntegration, an increase from the R default 512 to 5120 for this dataset
options(future.globals.maxSize=5120*1024^2)

seu_features = SelectIntegrationFeatures(object.list = seu_kera, nfeatures = 3000)
seu_kera = PrepSCTIntegration(object.list = seu_kera, anchor.features = seu_features, verbose = FALSE)
seu_kera = lapply(X = seu_kera, FUN = RunPCA, verbose = FALSE, features = seu_features)
seu_anchors = FindIntegrationAnchors(object.list = seu_kera, normalization.method = "SCT", anchor.features = seu_features, verbose = FALSE, reduction = "rpca")
seu_kera_integrated = IntegrateData(anchorset = seu_anchors, normalization.method = "SCT", verbose = FALSE)
library_per_cell = factor(unlist(lapply(seu, function(x) x@meta.data$library)), levels=library_names)
names(library_per_cell) = unlist(lapply(seu, function(x) rownames(x@meta.data)))
seu_kera_integrated = AddMetaData(object=seu_kera_integrated, metadata=library_per_cell, col.name="library")
```

```{r}
seu_kera_integrated = RunPCA(seu_kera_integrated, verbose = FALSE)

VlnPlot(seu_kera_integrated, group.by="orig.ident", features = c("nFeature_RNA", "nCount_RNA", "prop.mt"), ncol = 3, pt.size=0.01)
ElbowPlot(seu_kera_integrated, ndims=50)
```
This time we pick 10 dimensions, the point where the tail flattens off

```{r}
seu_kera_integrated = RunUMAP(seu_kera_integrated, dims = 1:10)
UMAPPlot(seu_kera_integrated)
p = UMAPPlot(seu_kera_integrated,group.by="library")
p = theme_publication_plot(p, "Library") + ggtitle(NULL) + xlab("UMAP 1") + ylab("UMAP 2")
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_umap_keratinocytes_overlay_library.pdf"),
                   base_height=4,
                   base_width=6)


UMAPPlot(seu_kera_integrated,group.by="Phase")

plot_dat = as.data.frame(seu_kera_integrated@reductions[["umap"]]@cell.embeddings)
plot_dat$krt14 = log(tpm["Krt14", rownames(seu_kera_integrated@meta.data)])
plot_dat$tgm3 = log(tpm["Tgm3", rownames(seu_kera_integrated@meta.data)])
plot_dat$krt4 = log(tpm["Krt4", rownames(seu_kera_integrated@meta.data)])
plot_dat$lor = log(tpm["Lor", rownames(seu_kera_integrated@meta.data)])

p1 = theme_publication_plot(ggplot(plot_dat) + aes(x=UMAP_1, y=UMAP_2, colour=krt14) + geom_point(size=0.2) + xlab("UMAP 1") + ylab("UMAP 2") + theme_cowplot() + ggtitle("Krt14"), legend_title = "log(TPM)")
p2 = theme_publication_plot(ggplot(plot_dat) + aes(x=UMAP_1, y=UMAP_2, colour=tgm3) + geom_point(size=0.2) + xlab("UMAP 1") + ylab("UMAP 2") + theme_cowplot() + ggtitle("Tgm3"), legend_title = "log(TPM)")
p3 = theme_publication_plot(ggplot(plot_dat) + aes(x=UMAP_1, y=UMAP_2, colour=krt4) + geom_point(size=0.2) + xlab("UMAP 1") + ylab("UMAP 2") + theme_cowplot() + ggtitle("Krt4"), legend_title = "log(TPM)")
p4 = theme_publication_plot(ggplot(plot_dat) + aes(x=UMAP_1, y=UMAP_2, colour=lor) + geom_point(size=0.2) + xlab("UMAP 1") + ylab("UMAP 2") + theme_cowplot() + ggtitle("Lor"), legend_title = "log(TPM)")
p = plot_grid(p1, p2, p3, p4, ncol=2)
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_keratinocytes_marker_expression.pdf"),
                   base_height=4,
                   base_width=6)

VlnPlot(seu_kera_integrated, group.by="orig.ident", features = "prop.mt", ncol = 1, pt.size=0.01)

FeaturePlot(seu_kera_integrated, features=c("nFeature_RNA", "nCount_RNA", "prop.mt"))

plot_dat = as.data.frame(seu_kera_integrated@reductions$umap@cell.embeddings)
plot_dat$phase = seu_kera_integrated@meta.data$Phase
plot_dat$phase = factor(plot_dat$phase, levels=c("G1", "G2M", "S"))
# mycolours = brewer.pal(4, "Dark2")
p = ggplot(plot_dat) + 
  aes(x=UMAP_1, y=UMAP_2, colour=phase) + 
  geom_point(size=0.2) + 
  xlab("UMAP 1") + ylab("UMAP 2")
p = theme_publication_plot(p, "Cell type")
print(p)

cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "figure5h.pdf"),
                   base_height=4,
                   base_width=6)

phase_count = seu_kera_integrated@meta.data %>% group_by(library, Phase) %>% summarise(n=n()) %>% mutate(frac = n/sum(n))
phase_count$library = factor(phase_count$library, levels=c("WT 1", "WT 2", "HOM KO 1", "HOM KO 2"))
p = ggplot(phase_count) + 
  aes(x=library, y=frac, fill=Phase) + 
  geom_bar(position="dodge", stat="identity") + 
  xlab("Library") + ylab("Proportion of cells per library")
p = theme_publication_plot(p, "Cycle phase")
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "figure5i_alt1.pdf"),
                   base_height=4,
                   base_width=6)

p = ggplot(phase_count) + 
  aes(x=library, y=frac, fill=Phase) + 
  geom_bar(stat="identity", position = position_fill(reverse = TRUE)) + 
  xlab("Library") + ylab("Proportion of cells per library")
p = theme_publication_plot(p, "Cycle phase")
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "figure5i_alt2.pdf"),
                   base_height=4,
                   base_width=6)

write.table(phase_count[, c("library", "Phase", "frac")] %>% pivot_wider(names_from=Phase, values_from=frac), 
            file=paste0(fileprefix, "frac_cells_library_cellcycle.txt"), quote=F, sep="\t", row.names=F)
```

Plot the cluster annotations in two ways - to match the heatmap further down. 
```{r}

seu_kera_integrated = FindNeighbors(seu_kera_integrated, dims = 1:10)
seu_kera_integrated = FindClusters(object = seu_kera_integrated, resolution = 0.6, verbose=FALSE)
markers_keratinocytes = FindAllMarkers(seu_kera_integrated, only.pos = TRUE, min.pct = 0.1, thresh.use = 0.25, verbose = F)
p <- DimPlot(object = seu_kera_integrated, reduction = "umap")
p = LabelClusters(plot = p, id = 'ident', size=6)
p = p + xlab("UMAP 1") + ylab("UMAP 2")
p = theme_publication_plot(p, "Cluster") + ggtitle(NULL)
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "cluster_annotations.pdf"),
                   base_height=4,
                   base_width=6)

seu_kera_integrated@meta.data$seurat_clusters_reordered = factor(as.numeric(as.character(seu_kera_integrated@meta.data$seurat_clusters)), levels=c(8,6,9,4,10,0,1,11,3,5,2,7))
p <- DimPlot(object = seu_kera_integrated, reduction = "umap", group.by = "seurat_clusters_reordered")
p = LabelClusters(plot = p, id = 'seurat_clusters_reordered', size=6)
p = p + xlab("UMAP 1") + ylab("UMAP 2")
p = theme_publication_plot(p, "Cluster") + ggtitle(NULL)
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "cluster_annotations_alt.pdf"),
                   base_height=4,
                   base_width=6)
write.table(markers_keratinocytes, file=paste0(fileprefix, "signif_expr_markers_keratinocyte_analysis.txt"), quote=F, sep="\t", row.names=F)
```

## Determine number of basal cells

```{r}
# established to broadly capture the changeover point where Krt14 drops of and Tgm3 starts going up
slope = -1.5
intercept = -4.5

add_threshold = function(p, slope, intercept) {
  return(p + geom_abline(slope=slope, intercept=intercept, colour="black", linetype=2))
}

pc1 = as.data.frame(seu_kera_integrated@reductions[["umap"]]@cell.embeddings[, c(1, 2), drop=F])
pc1$library = as.character(seu_kera_integrated@meta.data$library[match(rownames(pc1), rownames(seu_kera_integrated@meta.data))])
pc1$selected = ifelse(pc1$UMAP_2 < (intercept+slope*pc1$UMAP_1), 0, 1)
pc1$selected = factor(pc1$selected, levels=c(1, 0))
pc1$layer = "Basal"
pc1$layer[pc1$selected=="0"] = "Suprabasal"
pc1$layer = factor(pc1$layer, levels=c("Basal", "Suprabasal"))
```

Plot the expression of a number of markers relative to the selected threshold to show we've broadly captured the crossover area
```{r}
plot_dat = as.data.frame(seu_kera_integrated@reductions[["umap"]]@cell.embeddings)
plot_dat$krt14 = log(tpm["Krt14", rownames(seu_kera_integrated@meta.data)])
plot_dat$tgm3 = log(tpm["Tgm3", rownames(seu_kera_integrated@meta.data)])
plot_dat$krt4 = log(tpm["Krt4", rownames(seu_kera_integrated@meta.data)])
plot_dat$lor = log(tpm["Lor", rownames(seu_kera_integrated@meta.data)])

p1 = theme_publication_plot(ggplot(plot_dat) + aes(x=UMAP_1, y=UMAP_2, colour=krt14) + geom_point(size=0.2) + xlab("UMAP 1") + ylab("UMAP 2") + theme_cowplot() + ggtitle("Krt14"), legend_title = "log(TPM)")
p2 = theme_publication_plot(ggplot(plot_dat) + aes(x=UMAP_1, y=UMAP_2, colour=tgm3) + geom_point(size=0.2) + xlab("UMAP 1") + ylab("UMAP 2") + theme_cowplot() + ggtitle("Tgm3"), legend_title = "log(TPM)")
p3 = theme_publication_plot(ggplot(plot_dat) + aes(x=UMAP_1, y=UMAP_2, colour=krt4) + geom_point(size=0.2) + xlab("UMAP 1") + ylab("UMAP 2") + theme_cowplot() + ggtitle("Krt4"), legend_title = "log(TPM)")
p4 = theme_publication_plot(ggplot(plot_dat) + aes(x=UMAP_1, y=UMAP_2, colour=lor) + geom_point(size=0.2) + xlab("UMAP 1") + ylab("UMAP 2") + theme_cowplot() + ggtitle("Lor"), legend_title = "log(TPM)")
p = plot_grid(add_threshold(p1, slope, intercept), 
              add_threshold(p2, slope, intercept), 
              add_threshold(p3, slope, intercept), 
              add_threshold(p4, slope, intercept), ncol=2)
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_basal_cell_threshold_vs_markers.pdf"),
                   base_height=4,
                   base_width=6)
```

Plot the threshold per library to show which cells have been marked as basal cells
```{r}
mycolours = c("red", "grey")
p1 = ggplot(pc1[pc1$library=="WT 1",]) +
  aes(x=UMAP_1, y=UMAP_2, colour=layer) +
  geom_point(size=0.2) +
  scale_colour_manual(values=mycolours, na.value="grey") +
  xlab("UMAP 1") + ylab("UMAP 2") + ggtitle("WT 1") + theme_cowplot() + theme(legend.position="none")

p2 = ggplot(pc1[pc1$library=="WT 2",]) +
  aes(x=UMAP_1, y=UMAP_2, colour=layer) +
  geom_point(size=0.2) +
  scale_colour_manual(values=mycolours, na.value="grey") +
  xlab("UMAP 1") + ylab("UMAP 2") + ggtitle("WT 2") + theme_cowplot() + theme(legend.position="none") 

p3 = ggplot(pc1[pc1$library=="HOM KO 1",]) +
  aes(x=UMAP_1, y=UMAP_2, colour=layer) +
  geom_point(size=0.2) +
  scale_colour_manual(values=mycolours, na.value="grey") +
  xlab("UMAP 1") + ylab("UMAP 2") + ggtitle("HOM KO 1") + theme_cowplot() + theme(legend.position="none")

p4 = ggplot(pc1[pc1$library=="HOM KO 2",]) +
  aes(x=UMAP_1, y=UMAP_2, colour=layer) +
  geom_point(size=0.2) +
  scale_colour_manual(values=mycolours, na.value="grey") +
  xlab("UMAP 1") + ylab("UMAP 2") + ggtitle("HOM KO 2") + theme_cowplot() + theme(legend.position="none")

p = plot_grid(add_threshold(p1, slope, intercept), 
              add_threshold(p2, slope, intercept), 
              add_threshold(p3, slope, intercept), 
              add_threshold(p4, slope, intercept))
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_basal_cell_classification_map.pdf"),
                   base_height=4,
                   base_width=6)
```

Summarise the calls into a table
```{r}
basal_cell_count = pc1 %>% group_by(selected, library) %>% summarise(n=n()) %>% pivot_wider(names_from=selected, values_from=n)
colnames(basal_cell_count)[2:3] = c("num_basal", "num_suprabasal")
basal_cell_count[, c("frac_basal", "frac_suprabasal")] = basal_cell_count[, 2:3] / rowSums(basal_cell_count[, 2:3])
print(basal_cell_count)
write.table(basal_cell_count, file=paste0(fileprefix, "notch1_fraction_basal_cells_umap_threshold.txt"), quote=F, sep="\t", row.names=F)
```

Make the summary figures for the paper
```{r}

mycolours = c("red", "grey")
p = ggplot(pc1) + 
  aes(x=UMAP_1, y=UMAP_2, colour=layer) + 
  geom_point(size=0.2) + 
  scale_colour_manual(values=mycolours, na.value="grey") +
  xlab("UMAP 1") + ylab("UMAP 2")
p = theme_publication_plot(p, "Layer")
print(p)

cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "figure5f.pdf"),
                   base_height=4,
                   base_width=6)

basal_cell_count = basal_cell_count[,c("library", "frac_basal", "frac_suprabasal")]
basal_cell_count$library = factor(basal_cell_count$library, levels=c("WT 1", "WT 2", "HOM KO 1", "HOM KO 2"))
colnames(basal_cell_count)[2:3] = c("Basal", "Suprabasal")
p = ggplot(basal_cell_count %>% pivot_longer(!library)) + 
  aes(x=library, y=value, fill=name) + 
  geom_bar(position="dodge", stat="identity") + 
  scale_fill_manual(values=mycolours, na.value="grey") +
  xlab("Library") + ylab("Proportion of cells per library")
p = theme_publication_plot(p, "Layer")
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "figure5g_alt1.pdf"),
                   base_height=4,
                   base_width=6)

p = ggplot(basal_cell_count %>% pivot_longer(!library)) + 
  aes(x=library, y=value, fill=name) +
  geom_bar(stat="identity", position = position_fill(reverse = TRUE)) + 
  scale_fill_manual(values=mycolours, na.value="grey") +
  xlab("Library") + ylab("Proportion of cells per library")
p = theme_publication_plot(p, "Layer")
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "figure5g_alt2.pdf"),
                   base_height=4,
                   base_width=6)
```
Save the outcome
```{r}
# save the Krt4 estimates
metadat = seu_kera_integrated@meta.data
metadat$cell = rownames(metadat)

metadat = metadat[, c("cell", "orig.ident", "Phase", "prop.mt", "nCount_RNA", "nFeature_RNA")]
colnames(metadat)[2] = "library"

metadat$is_basal = pc1$layer=="Basal"
write.table(metadat, file=paste0(fileprefix, "notch1_basal_suprabasal_calls_umap_threshold.txt"), quote=F, sep="\t", row.names=F)
```


Plot percentage basal cells per cell cycle phase

```{r}
phase_count = seu_kera_integrated@meta.data[metadat$is_basal,] %>% group_by(library, Phase) %>% summarise(n=n()) %>% mutate(frac = n/sum(n))
phase_count$library = factor(phase_count$library, levels=c("WT 1", "WT 2", "HOM KO 1", "HOM KO 2"))

p = ggplot(phase_count) + 
  aes(x=library, y=frac, fill=Phase) + 
  geom_bar(stat="identity", position = position_fill(reverse = TRUE)) + 
  xlab("Library") + ylab("Proportion of cells per library")
p = theme_publication_plot(p, "Cycle phase")
print(p)

p = ggplot(phase_count) + 
  aes(x=library, y=n, fill=Phase) + 
  geom_bar(stat="identity", position = "dodge") + 
  xlab("Library") + ylab("Number of cells per library")
p = theme_publication_plot(p, "Cycle phase")
print(p)
```



```{r}
save(file=paste0(fileprefix, "notch1_keratinocytes_batch_effect_with_filter_with_adjustment.RData"), seu_kera_integrated, markers_keratinocytes)
```



## Additional plots

Downsampled plot showing equal cells per library
```{r}
counts_per_library = table(seu_integrated@meta.data$library)
set.seed(123)
selected_cells = rownames(seu_integrated@meta.data)[seu_integrated@meta.data$library=="WT 1"][sample(1:counts_per_library["WT 1"], 1500)]

selected_cells = c(selected_cells, rownames(seu_integrated@meta.data)[seu_integrated@meta.data$library=="WT 2"][sample(1:counts_per_library["WT 2"], 1500)])

selected_cells = c(selected_cells, rownames(seu_integrated@meta.data)[seu_integrated@meta.data$library=="HOM KO 1"][sample(1:counts_per_library["HOM KO 1"], 1500)])

selected_cells = c(selected_cells, rownames(seu_integrated@meta.data)[seu_integrated@meta.data$library=="HOM KO 2"][sample(1:counts_per_library["HOM KO 2"], 1500)])



seu_integrated_temp = subset(seu_integrated, cells=selected_cells)

p = UMAPPlot(seu_integrated_temp,group.by="library")
p = theme_publication_plot(p, "Library") + ggtitle(NULL) + xlab("UMAP 1") + ylab("UMAP 2")
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_umap_overlay_library_downsampled1500.pdf"),
                   base_height=4,
                   base_width=6)

counts_per_library = table(seu_kera_integrated@meta.data$library)
set.seed(123)
selected_cells = rownames(seu_kera_integrated@meta.data)[seu_kera_integrated@meta.data$library=="WT 1"][sample(1:counts_per_library["WT 1"], 1400)]

selected_cells = c(selected_cells, rownames(seu_kera_integrated@meta.data)[seu_kera_integrated@meta.data$library=="WT 2"][sample(1:counts_per_library["WT 2"], 1400)])

selected_cells = c(selected_cells, rownames(seu_kera_integrated@meta.data)[seu_kera_integrated@meta.data$library=="HOM KO 1"][sample(1:counts_per_library["HOM KO 1"], 1400)])

selected_cells = c(selected_cells, rownames(seu_kera_integrated@meta.data)[seu_kera_integrated@meta.data$library=="HOM KO 2"][sample(1:counts_per_library["HOM KO 2"], 1400)])


seu_integrated_temp = subset(seu_kera_integrated, cells=selected_cells)

p = UMAPPlot(seu_integrated_temp,group.by="library")
p = theme_publication_plot(p, "Library") + ggtitle(NULL) + xlab("UMAP 1") + ylab("UMAP 2")
print(p)
cowplot::save_plot(plot=p,
                   filename = paste0(fileprefix, "supplfig_umap_overlay_library_keratinocytes_downsampled1400.pdf"),
                   base_height=4,
                   base_width=6)
```

```{r}
# load("20210717_notch1_batch_effect_with_filter_with_adjustment.RData")
```

Plot heatmap with selected marker genes from McGin et al expression per cluster

```{r}
load(paste0(fileprefix, "notch1_keratinocytes_batch_effect_with_filter_with_adjustment.RData"))
basal_markers = c("Cdh3", "Itgb1", "Krt15", "Krt14", "Krt5", "Col17a1", "Sox2", "Trp63")
cycle_markers = c("Gmnn", "Mcm6", "Mcm2", "Cdt1", "Pcna", "Ccne1", "E2f1", "Ccne2", "Cdc6", "Aurkb", "Top2a", "Ccnb2", "Bub1", "Ube2c", "Aurka", "Kif23", "Ccnb1", "Mki67", "Mad2l1", "Birc5")
diff_markers = c("Krt13", "Klf4", "Tgm3", "Sbsn", "Grhl3", "Krt4", "Notch3")
seu_kera_integrated@meta.data$seurat_clusters_reordered = factor(as.numeric(as.character(seu_kera_integrated@meta.data$seurat_clusters)), levels=c(8,6,9,4,10,0,1,11,3,5,2,7))
# p = DoHeatmap(seu_kera_integrated, features = c(basal_markers, cycle_markers, diff_markers), group.by="seurat_clusters_reordered") + NoLegend()
# print(p)
# cowplot::save_plot(plot=p,
#                     filename = "figure_heatmap_all_libraries.pdf",
#                     base_height=4,
#                     base_width=8)
```

```{r}
p = DoHeatmap(seu_kera_integrated, 
          cells = rownames(seu_kera_integrated@meta.data)[seu_kera_integrated@meta.data$orig.ident %in% c("c1", "c2")],
          features = c(basal_markers, cycle_markers, diff_markers), group.by="seurat_clusters_reordered") + NoLegend()
print(p)
cowplot::save_plot(plot=p,
                    filename = "figure_heatmap_control_libraries.pdf",
                    base_height=4,
                    base_width=8)
p = DoHeatmap(seu_kera_integrated, 
          cells = rownames(seu_kera_integrated@meta.data)[seu_kera_integrated@meta.data$orig.ident %in% c("s1", "s2")],
          features = c(basal_markers, cycle_markers, diff_markers), group.by="seurat_clusters_reordered") + NoLegend()
print(p)
cowplot::save_plot(plot=p,
                    filename = "figure_heatmap_ko_libraries.pdf",
                    base_height=4,
                    base_width=8)

```
Save a table with counts per cluster per library
```{r}
cluster_counts = list()
for (cluster in unique(seu_kera_integrated@meta.data$seurat_clusters)) {
  res = seu_kera_integrated@meta.data[seu_kera_integrated@meta.data$seurat_clusters==cluster,] %>% group_by(library) %>% summarise(n=n())
  res$cluster = cluster
  cluster_counts[[cluster]] = res
}
cluster_counts = do.call(rbind, cluster_counts)
write.table(cluster_counts, file=paste0(fileprefix, "_cluster_library_cell_counts.txt"), quote=F, sep="\t", row.names=F)
```

